home *** CD-ROM | disk | FTP | other *** search
- /* file build_indices.c ... by ^z, 870820-0913-...
- *
- * revised 870930-871007 to allow more user options on keeping/discarding
- * punctuation, etc. -- ideas based on Bill Hole's suggestions
- *
- * contains subroutine to build indices for each chunk of input document
- * (database) text file for program qndxr ...
- *
- * Strategy is as outlined in qndxr: take in a big chunk of the doc_file,
- * generate the pointers to each word in the buffer as the buffer contents
- * are converted into appropriate (all caps, delimiters filtered into
- * spaces) form for sorting; sort the pointers in memory; and then write
- * out to disk the pointers and keys in the ndxr.c format for *.k and *.p
- * files.
- *
- * Allocate space for the doc and ptr buffers here so as to maximize use
- * of available memory ... note that we need to have room for the doc
- * buffer, for a ptr buffer that might (in the worst case of a file full
- * of 1-letter words) be twice as long as the doc buffer, and also space
- * for two standard zbuffers to accumulate the output *.k and *.p file
- * info in before sending it to disk.
- *
- * Note that for speed, while they are being sorted the pointers just point
- * directly to the key strings in the input buffer; they must be converted
- * into true offset pointers relative to the 0th byte of the document file
- * as they are written to disk in the *.p file! Make sure that all of
- * the delimiters in the document/database buffer are converted into
- * '\0' characters so that string comparison functions will work right.
- *
- * Also note that to avoid edge effects at the end of the buffer, an extra
- * amount of space is required at the end of the buffer, of length
- * KEY_LENGTH, to accommodate the end of the last word in the buffer.
- *
- * Use static local variables in the function here to keep track of where
- * we are in the document file from one chunk to the next, what chunk
- * number we are on now, etc.
- *
- * Give the user a chance to interrupt operations (in the Macintosh
- * version of this program) at intervals here, as long as
- * there are time-consuming I/O or sorting or scanning operations
- * to be done ...
- */
-
- #include <stdio.h>
- #include <unix.h>
- #include <storage.h>
- #include <strings.h>
- #include <ctype.h>
- #include <proto.h>
- #include "qndxr.2.h"
-
- /*
-  * build_indices -- index the next chunk of the document file.
-  *
-  * Allocates a document buffer and a pointer buffer, asks load_doc_buffer()
-  * to fill them starting at the current position of doc_file, sorts the
-  * resulting word pointers with zqsort(), and hands everything to
-  * write_sorted_files() along with the current pass number and the file
-  * offset at which this chunk began.  Both buffers are freed before
-  * returning.
-  *
-  * The static pass_number counts the chunks successfully indexed so far;
-  * it is incremented only after a chunk has been written out.
-  *
-  * Returns TRUE if a chunk containing at least one word was processed,
-  * or FALSE if load_doc_buffer() reported zero words (nothing left to
-  * index).
-  *
-  * NOTE(review): the results of make_buf() are not checked here;
-  * presumably make_buf() deals with allocation failure itself -- confirm.
-  */
- int build_indices ()
- {
-   static int pass_number = 0;
-   long doc_bufsiz, ptr_bufsiz, offset, load_doc_buffer(), nwords,
-        ftell();
-   extern long zbufsiz;
-   extern FILE *doc_file;
-   char *doc, **ptr;
-   void zqsort(), write_sorted_files();
-
-   doc_bufsiz = 2 * NMERGE * zbufsiz / 3;
-
-   /* Worst case (per the file header) is a buffer full of 1-letter words:
-    * one pointer for every two bytes of text.  Size the pointer buffer
-    * from sizeof (char *) rather than assuming 4-byte pointers, and add
-    * one slot so that an odd doc_bufsiz cannot leave us one pointer short.
-    * The cast keeps the arithmetic in long, matching doc_bufsiz.
-    */
-   ptr_bufsiz = (doc_bufsiz / 2 + 1) * (long) sizeof (char *);
-
-   /* KEY_LENGTH extra bytes leave room for the tail of the last word. */
-   DEBUG ("--allocating doc buffer of size %ld\n", doc_bufsiz + KEY_LENGTH);
-   doc = make_buf (doc_bufsiz + KEY_LENGTH);
-
-   DEBUG ("--allocating ptr buffer of size %ld\n", ptr_bufsiz);
-   ptr = (char **)make_buf (ptr_bufsiz);
-
- #ifdef LIGHTSPEED
-   check_interrupt ();
- #endif
-
-   /* Remember where this chunk starts; it is passed to
-    * write_sorted_files() below together with the sorted pointers.
-    */
-   offset = ftell (doc_file);
-   DEBUG ("--loading doc buffer beginning at offset %ld\n", offset);
-   nwords = load_doc_buffer (doc, doc_bufsiz, ptr);
-
-   if (nwords == 0)
-     {
-       DEBUG ("--Building done ... now freeing doc & ptr buffers\n", NULL);
-       free (doc);
-       free ((char *)ptr);
-       return (FALSE);
-     }
-
-   printf ("Index subfile #%d contains %ld words...\n", pass_number,
-           nwords);
-
- #ifdef LIGHTSPEED
-   check_interrupt ();
- #endif
-
-   DEBUG ("--sorting ptr array\n", NULL);
-   zqsort (ptr, ptr + nwords);
-
- #ifdef LIGHTSPEED
-   check_interrupt ();
- #endif
-
-   DEBUG ("--writing sorted keys and ptrs to disk\n", NULL);
-   write_sorted_files (doc, ptr, nwords, pass_number, offset);
-
- #ifdef LIGHTSPEED
-   check_interrupt ();
- #endif
-
-   DEBUG ("--freeing doc & ptr buffers\n", NULL);
-   free (doc);
-   free ((char *)ptr);
-
-   ++pass_number;
-   return (TRUE);
- }
-
-
-
-